Пример #1
0
def check_files_parsing_error(main, file_paths):
    """Split file paths into readable ones and ones that fail to be read.

    Only files with an ``.htm``, ``.html``, ``.tmx`` or ``.lrc`` extension
    (compared case-insensitively) are test-read; all other files are
    reported as OK without being opened.

    Args:
        main: Object whose ``settings_custom`` mapping holds the
            ``detect_encodings`` flag and the default encoding.
        file_paths: Iterable of file paths.

    Returns:
        A ``(files_ok, files_parsing_error)`` tuple of lists holding the
        normalized paths.
    """
    exts_to_check = ('.htm', '.html', '.tmx', '.lrc')

    files_parsing_error = []
    files_ok = []

    for file_path in file_paths:
        file_path = os.path.normpath(file_path)

        # Lower-case the extension so that e.g. ".HTML" is checked as well
        if os.path.splitext(file_path)[1].lower() not in exts_to_check:
            files_ok.append(file_path)

            continue

        if main.settings_custom['files']['auto_detection_settings']['detect_encodings']:
            encoding, _ = wordless_detection.detect_encoding(main, file_path)
        else:
            encoding = main.settings_custom['auto_detection']['default_settings']['default_encoding']

        try:
            with open(file_path, 'r', encoding = encoding) as f:
                # Consume every line so that any undecodable byte raises
                for _ in f:
                    pass
        # Not a bare "except", which would also swallow KeyboardInterrupt
        # and SystemExit
        except Exception:
            files_parsing_error.append(file_path)
        else:
            files_ok.append(file_path)

    return files_ok, files_parsing_error
Пример #2
0
def test_detection_encoding(file_name):
    """Check that encoding detection agrees with the encoding named in the file name.

    The expected encoding is the text enclosed in parentheses right before
    the ``.txt`` extension of ``file_name``; the comparison ignores case.
    """
    file_path = f'wordless_tests/files/encodings/{file_name}'

    encoding_code, success = wordless_detection.detect_encoding(main, file_path)

    encoding_detected = encoding_code.lower()
    encoding_expected = re.search(r'(?<=\()[^\(\)]+?(?=\)\.txt)', file_name).group().lower()

    assert encoding_detected == encoding_expected
    assert success
def check_files_parsing_error(main, files):
    """Split files into readable ones and ones that fail to be read.

    ``files`` is either a list of file dicts (with ``'path'`` and
    ``'encoding'`` keys) or a list of path strings; the kind is decided
    by the first element. Only files with a ``.csv``, ``.htm``, ``.html``,
    ``.xml``, ``.tmx`` or ``.lrc`` extension (compared case-insensitively)
    are test-read; all other files are reported as OK without being opened.

    Args:
        main: Object whose ``settings_custom`` mapping holds the
            ``detect_encodings`` flag and the default encoding (only
            read for path strings).
        files: List of file dicts or of path strings.

    Returns:
        A ``(files_ok, files_parsing_error)`` tuple of lists. File dicts
        are returned as they are, path strings are returned normalized.
        Both lists are empty if ``files`` is empty or its first element
        is neither a dict nor a string.
    """
    exts_to_check = ('.csv', '.htm', '.html', '.xml', '.tmx', '.lrc')

    files_parsing_error = []
    files_ok = []

    if not files:
        return files_ok, files_parsing_error

    # Wordless files
    if isinstance(files[0], dict):
        for file in files:
            file_path = file['path']

            # Lower-case the extension so that e.g. ".HTML" is checked too
            if os.path.splitext(file_path)[1].lower() in exts_to_check:
                try:
                    with open(file_path, 'r',
                              encoding=file['encoding']) as f:
                        # Consume every line so that undecodable bytes raise
                        for _ in f:
                            pass
                # Not a bare "except", which would also swallow
                # KeyboardInterrupt and SystemExit
                except Exception:
                    files_parsing_error.append(file)
                else:
                    files_ok.append(file)
            else:
                files_ok.append(file)
    # File paths
    elif isinstance(files[0], str):
        for file_path in files:
            file_path = wordless_misc.get_normalized_path(file_path)

            if os.path.splitext(file_path)[1].lower() in exts_to_check:
                if main.settings_custom['files'][
                        'auto_detection_settings']['detect_encodings']:
                    encoding, _ = wordless_detection.detect_encoding(
                        main, file_path)
                else:
                    encoding = main.settings_custom['auto_detection'][
                        'default_settings']['default_encoding']

                try:
                    with open(file_path, 'r', encoding=encoding) as f:
                        for _ in f:
                            pass
                except Exception:
                    files_parsing_error.append(file_path)
                else:
                    files_ok.append(file_path)
            else:
                files_ok.append(file_path)

    return files_ok, files_parsing_error
Пример #4
0
def test_detection_encoding(file_name):
    """Check that encoding detection agrees with the encoding named in the file name.

    The expected encoding is the text enclosed in parentheses right before
    the ``.txt`` extension of ``file_name``, lower-cased and with hyphens
    replaced by underscores.
    """
    file_path = f'wordless_tests/files/wordless_utils/wordless_detection/encoding/{file_name}'

    encoding_code, success = wordless_detection.detect_encoding(main, file_path)

    match = re.search(r'(?<=\()[^\(\)]+?(?=\)\.txt)', file_name)
    encoding_code_file = match.group().lower().replace('-', '_')

    assert encoding_code == encoding_code_file
    assert success
Пример #5
0
    def _new_file(self, file_path):
        """Build the file dict for ``file_path``.

        The encoding, text type and language are either detected or taken
        from the default settings, depending on the auto-detection flags in
        ``self.main.settings_custom``.

        Returns:
            A ``(new_file, success_encoding, success_text_type,
            success_lang)`` tuple. A success flag stays ``True`` when the
            corresponding detection is switched off.
        """
        name, _ = os.path.splitext(os.path.basename(file_path))

        new_file = {
            'selected': True,
            'path': file_path,
            'name': name,
            'name_old': name,
        }

        auto_detection = self.main.settings_custom['files']['auto_detection_settings']

        # Encoding
        encoding_ok = True

        if auto_detection['detect_encodings']:
            new_file['encoding'], encoding_ok = wordless_detection.detect_encoding(
                self.main, new_file['path'])
        else:
            new_file['encoding'] = self.main.settings_custom['auto_detection'][
                'default_settings']['default_encoding']

        # Text type
        text_type_ok = True

        if auto_detection['detect_text_types']:
            new_file['text_type'], text_type_ok = wordless_detection.detect_text_type(
                self.main, new_file)
        else:
            new_file['text_type'] = self.main.settings_custom['auto_detection'][
                'default_settings']['default_text_type']

        # Language
        lang_ok = True

        if auto_detection['detect_langs']:
            new_file['lang'], lang_ok = wordless_detection.detect_lang(
                self.main, new_file)
        else:
            new_file['lang'] = self.main.settings_custom['auto_detection'][
                'default_settings']['default_lang']

        return new_file, encoding_ok, text_type_ok, lang_ok
Пример #6
0
def check_files_decoding_error(main, files):
    """Split files into ones that can be decoded and ones that cannot.

    ``files`` is either a list of file dicts (with ``'path'`` and
    ``'encoding'`` keys) or a list of path strings; the kind is decided
    by the first element. Every file is read line by line with its
    encoding, and any error raised while opening or reading it marks
    the file as failed.

    Args:
        main: Object whose ``settings_custom`` mapping holds the
            ``detect_encodings`` flag and the default encoding (only
            read for path strings).
        files: List of file dicts (dict subclasses included) or of path
            strings.

    Returns:
        A ``(files_ok, files_decoding_error)`` tuple of lists. File dicts
        are returned as they are, path strings are returned as absolute
        paths. Both lists are empty if ``files`` is empty or its first
        element is neither a dict nor a string.
    """
    files_decoding_error = []
    files_ok = []

    if not files:
        return files_ok, files_decoding_error

    # Wordless files
    if isinstance(files[0], dict):
        for file in files:
            try:
                with open(file['path'], 'r',
                          encoding=file['encoding']) as f:
                    # Consume every line so that undecodable bytes raise
                    for _ in f:
                        pass
            # Not a bare "except", which would also swallow
            # KeyboardInterrupt and SystemExit
            except Exception:
                files_decoding_error.append(file)
            else:
                files_ok.append(file)
    # File paths
    elif isinstance(files[0], str):
        for file_path in files:
            file_path = wordless_misc.get_abs_path(file_path)

            if main.settings_custom['files']['auto_detection_settings'][
                    'detect_encodings']:
                encoding, _ = wordless_detection.detect_encoding(
                    main, file_path)
            else:
                encoding = main.settings_custom['auto_detection'][
                    'default_settings']['default_encoding']

            try:
                with open(file_path, 'r', encoding=encoding) as f:
                    for _ in f:
                        pass
            except Exception:
                files_decoding_error.append(file_path)
            else:
                files_ok.append(file_path)

    return files_ok, files_decoding_error
Пример #7
0
    def add_files(self):
        """Convert the queued files to text files, register them and emit the result.

        Each path in ``self.file_paths`` is handled according to its
        lower-cased extension:

        - ``.txt`` files are registered as they are.
        - ``.docx``, ``.xlsx``, ``.xls``, ``.csv``, ``.htm``/``.html``,
          ``.tmx`` and ``.lrc`` files are first converted to one text file
          (two for ``.tmx``: source and target) inside the temporary-file
          directory taken from the import settings.
        - Files with any other extension are skipped.

        Progress is reported through ``progress_updated``. At the end,
        ``files_added`` is emitted with the list of new files and the lists
        of paths for which the detection of the encoding, the text type and
        the language failed. If there are queued files, the default import
        path is set to the directory of the first one.
        """
        new_files = []

        files_detection_failed_encoding = []
        files_detection_failed_text_type = []
        files_detection_failed_lang = []

        if self.file_paths:
            len_file_paths = len(self.file_paths)

            for i, file_path in enumerate(self.file_paths):
                self.progress_updated.emit(
                    self.tr(f'Loading files ... ({i + 1}/{len_file_paths})'))

                default_dir = wordless_checking_misc.check_dir(
                    self.main.settings_custom['import']['temp_files']
                    ['default_path'])
                default_encoding = self.main.settings_custom['import'][
                    'temp_files']['default_encoding']

                file_path = wordless_misc.get_abs_path(file_path)
                file_name, file_ext = os.path.splitext(
                    os.path.basename(file_path))
                file_ext = file_ext.lower()

                # Reset for every file so that a file with an unsupported
                # extension never reuses the paths of the previous file
                new_paths = []

                # Text Files
                if file_ext == '.txt':
                    new_paths = [file_path]
                elif file_ext in ['.docx', '.xlsx', '.xls']:
                    new_path = wordless_checking_misc.check_new_path(
                        os.path.join(default_dir, f'{file_name}.txt'))

                    # Word Documents
                    if file_ext == '.docx':
                        with open(new_path, 'w',
                                  encoding=default_encoding) as f:
                            doc = docx.Document(file_path)

                            for block in self.iter_block_items(doc):
                                if isinstance(
                                        block,
                                        docx.text.paragraph.Paragraph):
                                    f.write(f'{block.text}\n')
                                elif isinstance(block, docx.table.Table):
                                    for row in self.iter_visual_cells(block):
                                        cells = [
                                            ' '.join([
                                                item.text for item in
                                                self.iter_cell_items(cell)
                                            ])
                                            for cell in row
                                        ]

                                        f.write('\t'.join(cells) + '\n')

                    # Excel Workbooks
                    elif file_ext == '.xlsx':
                        with open(new_path, 'w',
                                  encoding=default_encoding) as f:
                            workbook = openpyxl.load_workbook(
                                file_path, data_only=True)

                            for worksheet_name in workbook.sheetnames:
                                worksheet = workbook[worksheet_name]

                                for row in worksheet.rows:
                                    # Cell values are not necessarily
                                    # strings, and str.join() only accepts
                                    # strings
                                    f.write('\t'.join([
                                        ('' if cell.value is None
                                         else str(cell.value))
                                        for cell in row
                                    ]) + '\n')
                    elif file_ext == '.xls':
                        with open(new_path, 'w',
                                  encoding=default_encoding) as f:
                            workbook = xlrd.open_workbook(file_path)

                            for i_sheet in range(workbook.nsheets):
                                worksheet = workbook.sheet_by_index(i_sheet)

                                for row in range(worksheet.nrows):
                                    # Same as above: convert every cell
                                    # value to a string before joining
                                    f.write('\t'.join([
                                        str(worksheet.cell_value(row, col))
                                        for col in range(worksheet.ncols)
                                    ]) + '\n')

                    new_paths = [new_path]
                else:
                    # Detect encoding
                    if self.main.settings_custom['files'][
                            'auto_detection_settings']['detect_encodings']:
                        encoding_code, _ = wordless_detection.detect_encoding(
                            self.main, file_path)
                    else:
                        # Same settings entry as the one read by _new_file()
                        encoding_code = self.main.settings_custom[
                            'auto_detection']['default_settings'][
                                'default_encoding']

                    # CSV Files
                    if file_ext == '.csv':
                        new_path = wordless_checking_misc.check_new_path(
                            os.path.join(default_dir, f'{file_name}.txt'))

                        with open(new_path, 'w',
                                  encoding=default_encoding) as f:
                            with open(file_path,
                                      'r',
                                      newline='',
                                      encoding=encoding_code) as f_csv:
                                csv_reader = csv.reader(f_csv)

                                for row in csv_reader:
                                    f.write('\t'.join(row) + '\n')

                        new_paths = [new_path]

                    # HTML Files
                    elif file_ext in ['.htm', '.html']:
                        with open(file_path, 'r',
                                  encoding=encoding_code) as f:
                            soup = bs4.BeautifulSoup(f.read(), 'lxml')

                        new_path = wordless_checking_misc.check_new_path(
                            os.path.join(default_dir, f'{file_name}.txt'))

                        with open(new_path, 'w',
                                  encoding=default_encoding) as f:
                            f.write(soup.get_text())

                        new_paths = [new_path]

                    # Translation Memory Files
                    elif file_ext == '.tmx':
                        lines_src = []
                        lines_target = []

                        with open(file_path, 'r',
                                  encoding=encoding_code) as f:
                            soup = bs4.BeautifulSoup(f.read(), 'lxml-xml')

                            for tu in soup.find_all('tu'):
                                # Raises ValueError unless the unit holds
                                # exactly two segments
                                seg_src, seg_target = tu.find_all('seg')

                                lines_src.append(seg_src.get_text())
                                lines_target.append(seg_target.get_text())

                        path_src = wordless_checking_misc.check_new_path(
                            os.path.join(default_dir,
                                         f'{file_name}_source.txt'))
                        path_target = wordless_checking_misc.check_new_path(
                            os.path.join(default_dir,
                                         f'{file_name}_target.txt'))

                        with open(path_src, 'w',
                                  encoding=default_encoding) as f:
                            f.write('\n'.join(lines_src))
                            f.write('\n')

                        with open(path_target,
                                  'w',
                                  encoding=default_encoding) as f:
                            f.write('\n'.join(lines_target))
                            f.write('\n')

                        new_paths = [path_src, path_target]

                    # Lyrics Files
                    elif file_ext == '.lrc':
                        lyrics = {}

                        with open(file_path, 'r',
                                  encoding=encoding_code) as f:
                            for line in f:
                                time_tags = []

                                line = line.strip()

                                # Strip time tags
                                while re.search(r'^\[[^\]]+?\]', line):
                                    time_tags.append(
                                        re.search(r'^\[[^\]]+?\]',
                                                  line).group())

                                    line = line[len(time_tags[-1]):].strip()

                                # Strip word time tags
                                line = re.sub(r'<[^>]+?>', r'', line)
                                line = re.sub(r'\s{2,}', r' ', line).strip()

                                # Only tags of the form [mm:ss.xx] are
                                # kept, a line with several of them is
                                # stored once per tag
                                for time_tag in time_tags:
                                    if re.search(
                                            r'^\[[0-9]{2}:[0-5][0-9]\.[0-9]{2}\]$',
                                            time_tag):
                                        lyrics[time_tag] = line

                        new_path = wordless_checking_misc.check_new_path(
                            os.path.join(default_dir, f'{file_name}.txt'))

                        with open(new_path, 'w',
                                  encoding=default_encoding) as f:
                            # Write the lines ordered by their time tags
                            for _, lyrics_line in sorted(lyrics.items()):
                                f.write(f'{lyrics_line}\n')

                        new_paths = [new_path]

                # Register the text file(s) belonging to the current file
                for new_path in new_paths:
                    (new_file, detection_success_encoding,
                     detection_success_text_type, detection_success_lang
                     ) = self.main.wordless_files._new_file(new_path)

                    new_files.append(new_file)

                    if not detection_success_encoding:
                        files_detection_failed_encoding.append(
                            new_file['path'])

                    if not detection_success_text_type:
                        files_detection_failed_text_type.append(
                            new_file['path'])

                    if not detection_success_lang:
                        files_detection_failed_lang.append(new_file['path'])

            self.main.settings_custom['import']['files'][
                'default_path'] = wordless_misc.get_abs_path(
                    os.path.dirname(self.file_paths[0]))

        self.files_added.emit(new_files, files_detection_failed_encoding,
                              files_detection_failed_text_type,
                              files_detection_failed_lang)
Пример #8
0
def detect_encoding(file):
    """Print the detected encoding of ``file`` and whether the detection succeeded.

    ``file`` is a mapping with a ``'name'`` and a ``'path'`` entry.
    """
    print(f'Detect the encoding of file "{file["name"]}": ', end='')

    encoding_code, success = wordless_detection.detect_encoding(main, file["path"])
    outcome = 'Success' if success else 'Fail'

    print(f"{encoding_code} ({outcome})")
Пример #9
0
    def import_list(self, settings):
        """Let the user pick text files and import their lines as list items.

        ``settings`` is the key of the section inside the ``'import'``
        settings that holds the default path and the ``detect_encodings``
        flag for this list.

        The chosen files are checked for being empty and for loading
        errors. If any file fails a check, an error message box and an
        error message are shown and nothing is imported. Otherwise every
        stripped line that is not among the current items is loaded, with
        duplicates among the new lines removed.
        """
        import_settings = self.main.settings_custom['import'][settings]

        # Fall back to the default path if the saved one no longer exists
        if os.path.exists(import_settings['default_path']):
            default_dir = import_settings['default_path']
        else:
            default_dir = self.main.settings_default['import'][settings][
                'default_path']

        file_paths = QFileDialog.getOpenFileNames(
            self.main,
            self.tr('Import from File(s)'),
            default_dir,
            self.tr('Text File (*.txt)'))[0]

        # Nothing selected
        if not file_paths:
            return

        import_settings['default_path'] = os.path.normpath(
            os.path.dirname(file_paths[0]))

        # Check files
        file_paths, files_empty = wordless_checking_file.check_files_empty(
            self.main, file_paths)

        detect_encodings = import_settings['detect_encodings']
        files = []

        for file_path in file_paths:
            path_normalized = os.path.normpath(file_path)

            if detect_encodings:
                encoding = wordless_detection.detect_encoding(
                    self.main, file_path)[0]
            else:
                encoding = self.main.settings_custom['auto_detection'][
                    'default_settings']['default_encoding']

            files.append({'path': path_normalized, 'encoding': encoding})

        encodings = [file['encoding'] for file in files]

        (file_paths,
         files_loading_error) = wordless_checking_file.check_files_loading_error(
             self.main, file_paths, encodings)

        if files_empty or files_loading_error:
            wordless_msg_box.wordless_msg_box_file_error_on_importing(
                self.main,
                files_empty=files_empty,
                files_loading_error=files_loading_error)

            wordless_msg.wordless_msg_import_list_error(self.main)

            return

        # Check duplicate items
        items_cur = self.get_items()
        num_prev = len(items_cur)

        items_to_import = []

        for file in files:
            if file['path'] not in file_paths:
                continue

            with open(file['path'], 'r', encoding=file['encoding']) as f:
                items_to_import.extend(
                    item
                    for item in (line.strip() for line in f)
                    if item not in items_cur)

        # OrderedDict.fromkeys() drops repeated lines and keeps their order
        self.load_items(collections.OrderedDict.fromkeys(items_to_import))
        self.itemChanged.emit(self.item(0))

        wordless_msg.wordless_msg_import_list_success(
            self.main, num_prev, len(self.get_items()))
def check_files_empty(main, files):
    """Split files into non-empty and empty ones.

    ``files`` is either a list of file dicts (with ``'path'`` and
    ``'encoding'`` keys) or a list of path strings; the kind is decided
    by the first element.

    Files with a ``.txt``, ``.csv``, ``.htm``, ``.html``, ``.xml``,
    ``.tmx`` or ``.lrc`` extension (compared case-insensitively) count as
    empty when they contain nothing but whitespace. Such a file is
    reported as OK if it cannot be opened or decoded. All other files
    count as empty when their size is zero.

    Args:
        main: Object whose ``settings_custom`` mapping holds the
            ``detect_encodings`` flag and the default encoding (only
            read for path strings).
        files: List of file dicts or of path strings.

    Returns:
        A ``(files_ok, files_empty)`` tuple of lists. File dicts are
        returned as they are, path strings are returned normalized. Both
        lists are empty if ``files`` is empty or its first element is
        neither a dict nor a string.
    """
    exts_text = ('.txt', '.csv', '.htm', '.html', '.xml', '.tmx', '.lrc')

    files_empty = []
    files_ok = []

    if not files:
        return files_ok, files_empty

    # Wordless files
    if isinstance(files[0], dict):
        for file in files:
            file_path = file['path']

            # Text files (lower-cased so that e.g. ".TXT" is included)
            if os.path.splitext(file_path)[1].lower() in exts_text:
                try:
                    with open(file_path, 'r',
                              encoding=file['encoding']) as f:
                        # Stops at the first line with visible content
                        empty_file = not any(line.strip() for line in f)
                # Not a bare "except", which would also swallow
                # KeyboardInterrupt and SystemExit
                except Exception:
                    files_ok.append(file)
                else:
                    if empty_file:
                        files_empty.append(file)
                    else:
                        files_ok.append(file)
            # Other file types
            else:
                if os.stat(file_path).st_size:
                    files_ok.append(file)
                else:
                    files_empty.append(file)
    # File paths
    elif isinstance(files[0], str):
        for file_path in files:
            file_path = wordless_misc.get_normalized_path(file_path)

            # Text files
            if os.path.splitext(file_path)[1].lower() in exts_text:
                if main.settings_custom['files'][
                        'auto_detection_settings']['detect_encodings']:
                    encoding, _ = wordless_detection.detect_encoding(
                        main, file_path)
                else:
                    encoding = main.settings_custom['auto_detection'][
                        'default_settings']['default_encoding']

                try:
                    with open(file_path, 'r', encoding=encoding) as f:
                        empty_file = not any(line.strip() for line in f)
                except Exception:
                    files_ok.append(file_path)
                else:
                    if empty_file:
                        files_empty.append(file_path)
                    else:
                        files_ok.append(file_path)
            # Other file types
            else:
                if os.stat(file_path).st_size:
                    files_ok.append(file_path)
                else:
                    files_empty.append(file_path)

    return files_ok, files_empty