Пример #1
0
    def test_file_input(self):
        """Check that every sample file is detected with an expected encoding.

        Each ``.srt`` and ``.txt`` file under ``./data`` runs as its own
        sub-test. The expectation stored in ``SHOULD_BE`` under the file's
        base name is either a single encoding name (exact match required) or
        a container of acceptable names (membership required).
        """
        sample_paths = glob('./data/*.srt') + glob('./data/*.txt')

        for path_name in sample_paths:
            with self.subTest('test_file_input <{}>'.format(path_name)):
                matches = CnM.from_path(path_name)
                self.assertGreater(len(matches), 0)

                best_match = matches.best().first()
                self.assertIsNotNone(best_match)

                expected = TestFileCharsetNormalizer.SHOULD_BE[basename(path_name)]
                if isinstance(expected, str):
                    self.assertEqual(best_match.encoding, expected)
                else:
                    self.assertIn(best_match.encoding, expected)
Пример #2
0
def process_drive(img_burn_exe: str, drive: str, output_folder: str):
    """
    Processes drive, tests if it is ready and if it is - tries to backup

    When the drive holds an ``Autorun.inf`` file, the ``label`` option of its
    ``[autorun]`` section (upper-cased) is passed to the backup as the label;
    in every other case the label is an empty string.

    :param img_burn_exe: path to Exe file
    :param drive: Drive from which backup will be performed
    :param output_folder: Folder to which output will be saved
    :return: None
    """
    if not test_drive(drive):
        logging.info("Waiting for drive: %s to be ready", drive)
        return

    autorun_file = Path(f"{drive}Autorun.inf")
    autorun_label = ""
    if autorun_file.is_file():
        parser = ConfigParser()
        encoding = CnM.from_path(autorun_file).best().first().encoding
        logging.debug("Detected autorun.inf encoding: %s", encoding)
        try:
            # The whole file is lower-cased so that the section and option
            # lookups below match whatever capitalisation the file uses.
            parser.read_string(
                autorun_file.read_text(encoding=encoding).lower())
        except DuplicateOptionError as err:
            # Best effort: a file with duplicated options only costs us the
            # label, but leave a trace instead of failing silently.
            logging.warning(
                "Ignoring autorun.inf with duplicate options on %s: %s",
                drive, err)
        else:
            # has_option() is simply False when there is no [autorun]
            # section, whereas indexing the parser would raise KeyError.
            if parser.has_option('autorun', 'label'):
                autorun_label = parser['autorun']['label'].upper()

    backup_disk(autorun_label, drive, img_burn_exe, output_folder)
Пример #3
0
def normalise_to_utf8(bytes_or_filepath):
    """Convert any text input with unknown encoding to a decoded string.

    Parameters
    ----------
    bytes_or_filepath : bytes or str
        A binary string or path to any text file in any encoding.

    Returns
    -------
    str
        The text of the best encoding match found for the input.

    Raises
    ------
    TypeError
        Input is not of type bytes or a valid path to an existing file.
    ValueError
        No encoding match was found for the input.
    """
    # isinstance() also accepts subclasses of bytes, unlike a type() equality.
    if isinstance(bytes_or_filepath, bytes):
        matches = cnm.from_bytes(bytes_or_filepath)
    elif os.path.isfile(bytes_or_filepath):
        matches = cnm.from_path(bytes_or_filepath)
    else:
        raise TypeError('Input must be bytes or a valid file path')

    best_match = matches.best().first()
    if best_match is None:
        # NOTE(review): first() is assumed to return None when there is no
        # match; str(None) would otherwise hand the text "None" to the caller.
        raise ValueError('Could not detect the encoding of the input')

    return str(best_match)
Пример #4
0
def _detect_file_encoding(path: Path) -> str:
    """Return an approximate encoding of text file.

    Performs an encoding detection and BOM check.

    Args:
        path: The path to playlist file

    Returns:
        "utf-16-le" for files with the ".aimppl4" suffix. Otherwise
        "utf-8-sig" or "utf-8" when UTF-8 is detected (with or without a
        byte order mark), or the detected encoding name as reported by
        'charset_normalizer' (detection is restricted to 'utf_8', 'cp1252',
        'cp1251' and 'utf_16_le').

    Raises:
        ClickException: The file was not found or
            the encoding was not retrieved from 'charset_normalizer'
    """
    try:
        # Detection runs even for ".aimppl4" files, so an unreadable file is
        # still reported as an OSError below.
        detection_result = (CnM.from_path(
            path, cp_isolation=["utf_8", "cp1252", "cp1251",
                                "utf_16_le"]).best().first())

        if path.suffix == ".aimppl4":
            return "utf-16-le"
        if detection_result.encoding != "utf_8":
            return detection_result.encoding
        return "utf-8-sig" if detection_result.byte_order_mark else "utf-8"
    except (OSError, AttributeError) as error:
        # AttributeError presumably covers a detection that produced no match
        # object (TODO confirm). Chain the cause so the traceback keeps it.
        raise ClickException(str(error)) from error
def encoding_from_path(txt_file_path):
    """Return the encoding name to use when opening ``txt_file_path``.

    The encoding is detected from the file contents. 'utf-8' is returned
    instead when nothing is detected, or when the detected encoding is
    'big5' or 'cp1252'.

    :param txt_file_path: path of the text file to inspect
    :return: encoding name as a string
    """
    enc = CnM.from_path(txt_file_path).best().first()
    if enc is None:
        # NOTE(review): first() is assumed to return None when there is no
        # match; fall back to the default the original code initialised.
        return 'utf-8'

    file_encoding = enc.encoding
    # These two results are overridden with utf-8 - presumably they are
    # misdetections for the files this is used on (TODO confirm).
    if file_encoding in ('big5', 'cp1252'):
        return 'utf-8'
    return file_encoding
Пример #6
0
def source(source_file_s):
    """Return the text of ``source_file_s`` decoded with its detected encoding.

    When the resulting text is empty, a message is added to ``ui.message``
    and ``ui.save_button`` is hidden; the (empty) text is still returned.
    """
    # TODO: file missing exception
    detected = cnm.from_path(source_file_s).best().first()
    text_str = str(detected)

    if not text_str:
        # The message reads "The file turned out to be empty!"
        ui.message.addItem("Файл оказался пустым!")
        ui.save_button.setVisible(False)

    return text_str
Пример #7
0
    def get_corpus(self):
        """Build the corpus from paired wav recordings and text transcripts.

        For every directory that ``os.walk`` reports under the "wav/IT"
        folder, each listed file is paired with the transcript
        ``<txt/IT>/<dir>/<stem>.txt``, where ``<stem>`` is the file name up
        to its first dot. Files without a transcript are reported and
        skipped. The stripped first line of the transcript becomes the
        utterance for that wav path.

        :return: a ``Corpus`` built from the collected utterances and audio
            paths, with ``make_wav_resample`` set to True
        """
        utterances = {}
        audios = []
        wav_dir = os.path.join(self.origin_data_path, self.extract_dir, "wav",
                               "IT")
        text_dir = os.path.join(self.origin_data_path, self.extract_dir, "txt",
                                "IT")

        for subdir, dirs, _files in os.walk(wav_dir):
            for _dir in dirs:
                curr_wav_dir = os.path.join(subdir, _dir)
                curr_txt_dir = os.path.join(text_dir, _dir)

                # Iterate over the wav files of the current folder
                for fname in os.listdir(curr_wav_dir):
                    fname = os.fsdecode(fname)

                    # NOTE(review): this path is built from wav_dir, not from
                    # curr_wav_dir, so it only matches the listed file for
                    # first-level folders - confirm the data has no nesting.
                    wav_file_path = os.path.join(wav_dir, _dir, fname)
                    txt_file_path = os.path.join(curr_txt_dir,
                                                 fname.split('.')[0] + '.txt')
                    if not os.path.isfile(txt_file_path):
                        # Name the offending file; the placeholder used to be
                        # printed literally as "{}".
                        print('audio file {} doesn\'t have a file transcript'
                              .format(wav_file_path))
                        continue

                    # Transcripts come in different encodings (utf-8,
                    # utf_16_be, ...), so detect it before opening the file.
                    enc = CnM.from_path(txt_file_path).best().first()
                    # NOTE(review): first() is assumed to return None when
                    # nothing is detected; fall back to utf-8 in that case.
                    file_encoding = enc.encoding if enc is not None else 'utf-8'
                    # These two results are overridden with utf-8 -
                    # presumably misdetections for this data (TODO confirm).
                    if file_encoding in ('big5', 'cp1252'):
                        file_encoding = 'utf-8'

                    # Only the first line of the transcript file is used
                    with open(txt_file_path, "r", encoding=file_encoding) as f:
                        transcript = f.readline().strip()

                    # Append to the data manifest
                    utterances[wav_file_path] = transcript
                    audios.append(wav_file_path)

        # Collect the corpus
        corpus = Corpus(utterances, audios)
        # NOTE(review): the original comment said both that the SIWIS clips
        # (44100Hz, 706 kb/s, 1 channel) need resampling and that resampling
        # is not required; the code enables it.
        corpus.make_wav_resample = True
        return corpus
    def read_txt_file(self, txt_file_path):
        """Return the stripped first line of the text file ``txt_file_path``.

        The file's encoding is detected first, because the files come in
        different encodings (utf-8, utf_16_be, ...). 'utf-8' is used when
        nothing is detected, or when the detected encoding is 'big5' or
        'cp1252'.

        :param txt_file_path: path of the transcript file to read
        :return: first line of the file without surrounding whitespace
        """
        enc = CnM.from_path(txt_file_path).best().first()
        # NOTE(review): first() is assumed to return None when nothing is
        # detected; fall back to utf-8 in that case.
        file_encoding = enc.encoding if enc is not None else 'utf-8'
        # These two results are overridden with utf-8 - presumably they are
        # misdetections for this data (TODO confirm).
        if file_encoding in ('big5', 'cp1252'):
            file_encoding = 'utf-8'

        with open(txt_file_path, "r", encoding=file_encoding) as f:
            return f.readline().strip()
Пример #9
0
 def parse(self, file_name):
     """Decode ``file_name`` with its detected encoding and parse the text.

     Returns the first element of what ``self.parser.parse`` produces.
     """
     detected = CnM.from_path(file_name).best().first()
     parsed = self.parser.parse(str(detected))
     return parsed[0]
Пример #10
0

######################################################################################
def ClearScreen():
    """Clear the terminal using the shell command of the current platform.

    Runs ``cls`` on Windows and ``clear`` on Linux and macOS (reported as
    "Darwin"). Does nothing on any other platform.
    """
    clear_commands = {"Windows": "cls", "Linux": "clear", "Darwin": "clear"}
    command = clear_commands.get(platform.system())
    if command is not None:
        os.system(command)


######################################################################################
ClearScreen()
# Ask the user to pick the model subtitle file and the target subtitle file.
ModelPath = easygui.fileopenbox("Selecione a legenda modelo:")
TargetPath = easygui.fileopenbox("Selecione a legenda alvo:")
######################################################################################
# Detect the text encoding of each file so both can be opened correctly below.
ModelCoding = CnM.from_path(ModelPath).best().first().encoding
TargetCoding = CnM.from_path(TargetPath).best().first().encoding
######################################################################################

# NOTE(review): neither handle is closed in this section - confirm that the
# rest of the script closes them.
ModelSub = open(ModelPath, encoding=ModelCoding)
TargetSub = open(TargetPath, encoding=TargetCoding)
ModelContent = ModelSub.readlines()
TargetContent = TargetSub.readlines()
# Record the index of every model line that IsSubTimeLine accepts.
i = 0
ModelTimesPos = []
for l in ModelContent:
    if IsSubTimeLine(l):
        ModelTimesPos.append(i)
    i += 1
# Reset the counter and start the matching index list for the target file.
i = 0
TargetTimesPos = []
Пример #11
0
args = sys.argv
enc = None
vk = None
data = []

# Both command-line flags are optional. They default to None ("not given")
# so that later checks such as ``use_cp1251 is None`` never hit an unbound
# name when fewer than two arguments are passed.
use_cp1251 = None
use_auth = None

try:
    if len(args) > 1:
        use_cp1251 = parse_arg(args[1])
    if len(args) > 2:
        use_auth = parse_arg(args[2])
except ValueError as exc:
    # The message reads "Invalid argument: ... Only the arguments ... are
    # possible".
    print(f"Неверный аргумент: {exc.args[0]}. Возможны только аргументы {POSSIBLE_ARGS}")
    sys.exit(-1)

try:
    enc = CnM.from_path(INPUT_FNAME).best().first().encoding
    if enc != 'utf-8':
        print("\n\n", ENC_WARN, "\n\n")
        if use_cp1251 is None:
            use_cp1251 = yes_no("Использовать cp1251 вместо текущей кодировки?")
        if use_cp1251:
            enc = 'cp1251'
    # parse the file with group IDs
    print("Используется кодировка: ", enc)
    with open(INPUT_FNAME, 'r', newline='', encoding=enc) as csvf:
        dialect = csv.Sniffer().sniff(csvf.read(1024))
        csvf.seek(0)
        reader = csv.reader(csvf, dialect=dialect)
        for row in reader:
            if row[0] == INPUT_FILE_HEADER_GROUP_TITLE:
                continue
Пример #12
0
    def get_data(folder_path: str) -> dict:
        """
        Iterates through a directory to create a Dict of Pandas DataFrames.

        Excel files (``*.xls?`` and ``*.xls``) are read sheet by sheet; CSV
        files are read with the encoding reported by ``detect_encoding`` and,
        if that fails with a ``UnicodeDecodeError``, re-read from text
        normalised by charset_normalizer. Files that cannot be loaded are
        reported and skipped. The index of every DataFrame is renamed to
        ``file_index``.

        :type folder_path: str
        :rtype: dict

        keys: file path for CSV files, file path followed directly by the
            sheet name for Excel sheets
        values: pd.DataFrame
        """
        print("Initializing Data Retrieval...")

        csvfiles = glob.glob(folder_path + "/**/*.csv", recursive=True)
        xlfiles = glob.glob(folder_path + "/**/*.xls?", recursive=True)
        xlfiles = xlfiles + glob.glob(folder_path + "/**/*.xls", recursive=True)
        file_dict = {}
        total = len(csvfiles) + len(xlfiles)

        for i, file in enumerate(xlfiles, start=1):
            print("Reading File %d of %d:" % (i, total))
            print("\tFull Path: ", file)
            try:
                # sheet_name=None loads every sheet into a dict keyed by name
                sheets = pd.read_excel(file, sheet_name=None)

                for sheet, frame in sheets.items():
                    print("\t\t", sheet, "processed...")
                    frame.index.rename('file_index', inplace=True)
                    file_dict[file + sheet] = frame
            except Exception:
                # Not a bare except: Ctrl-C and SystemExit must still be able
                # to stop a long import.
                logging.exception('COULD NOT LOAD %s', file)
                print('\t\tFAILED')

        # The progress counter continues after the Excel files
        for i, file in enumerate(csvfiles, start=len(xlfiles) + 1):
            print("Reading File %d of %d:" % (i, total))
            print("\tFull Path: ", file)
            try:
                df = pd.read_csv(file, low_memory=False, header='infer',
                                 encoding=detect_encoding(file))
                df.index.rename('file_index', inplace=True)
                file_dict[file] = df
            except UnicodeDecodeError:
                try:
                    print("Encoding Detection Failed... Attempting to Normalize...")
                    normalized = StringIO(str(CnM.from_path(file).best().first()))
                    df = pd.read_csv(normalized, low_memory=False, header='infer')
                    df.index.rename('file_index', inplace=True)
                    file_dict[file] = df
                    print("Success!")
                except Exception:
                    print('Encoding Normalization Failed')
            except Exception:
                logging.exception('COULD NOT LOAD %s', file)
                print('\t\tFAILED')
        return file_dict
Пример #13
0
from charset_normalizer import CharsetNormalizerMatches as cnm

# Decode test.txt with its detected encoding and print the resulting text.
best_match = cnm.from_path("test.txt").best().first()
text_str = str(best_match)
print(text_str)